Data Description

These COVID-19 datasets are collected and maintained through a subsection of Johns Hopkins University. Each day, the total number of confirmed COVID-19 cases and deaths is updated for each country in the world, and each of the states in the US, to provide accurate and updated time series data on the proliferation of the virus. The datasets can be found at the following URL: https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_time_series

Data Cleaning

library(tidyverse)
library(lubridate)
library(dplyr)
library(leaps)
library(plotly)
library(usmap)

url_in <- "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/"

file_names <- c("time_series_covid19_confirmed_US.csv",
                "time_series_covid19_deaths_US.csv",
                "time_series_covid19_confirmed_global.csv",
                "time_series_covid19_deaths_global.csv")

urls <- str_c(url_in, file_names)
#urls

us_cases <- read_csv(urls[1])
us_deaths <- read_csv(urls[2])
global_cases <- read_csv(urls[3])
global_deaths <- read_csv(urls[4])
#Tidy the data so that each date is on a separate row
global_cases <- global_cases %>%
  pivot_longer(cols = -c(`Province/State`, `Country/Region`, Lat, Long),
               names_to = "date",
               values_to = "cases") %>%
  select(-c(Lat, Long))

#global_cases
global_deaths <- global_deaths %>%
  pivot_longer(cols = -c(`Province/State`, `Country/Region`, Lat, Long),
               names_to = "date",
               values_to = "deaths") %>%
  select(-c(Lat, Long))

#global_deaths
global <- global_cases %>%
  full_join(global_deaths) %>%
  rename(Country_Region = `Country/Region`,
        Province_State = `Province/State`) %>%
  mutate(date = mdy(date))

#global
#summary(global)

global <- global %>%
  filter(cases > 0)
#summary(global)
us_cases <- us_cases %>%
  pivot_longer(cols = -(UID:Combined_Key),
               names_to = "date",
               values_to = "cases") %>%
  select(Admin2:cases) %>%
  mutate(date = mdy(date)) %>%
  select(-c(Lat, Long_))

#us_cases
us_deaths <- us_deaths %>%
  pivot_longer(cols = -(UID:Population),
               names_to = "date",
               values_to = "deaths") %>%
  select(Admin2:deaths) %>%
  mutate(date = mdy(date)) %>%
  select(-c(Lat, Long_))

#us_deaths
US <- us_cases %>%
  full_join(us_deaths)

#US
global <- global %>%
  unite("Combined_Key", 
        c(Province_State, Country_Region),
        sep = ", ",
        na.rm = TRUE,
        remove = FALSE)

#global
uid_url <- "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/UID_ISO_FIPS_LookUp_Table.csv"

uid <- read_csv(uid_url) %>%
  select(-c(Lat, Long_, Combined_Key, code3, iso2, iso3, Admin2))

global <- global %>%
  left_join(uid, by = c("Province_State", "Country_Region")) %>%
  select(-c(UID, FIPS)) %>%
  select(Province_State, Country_Region, date,
         cases, deaths, Population, Combined_Key)
#global
US_by_state <- US %>%
  group_by(Province_State, Country_Region, date) %>%
  summarize(cases = sum(cases), deaths = sum(deaths), Population = sum(Population)) %>%
  mutate(deaths_per_mill = deaths * 1000000 / Population) %>%
  select(Province_State, Country_Region, date, cases, deaths, deaths_per_mill, Population) %>%
  ungroup()

#US_by_state
US_totals <- US_by_state %>%
  group_by(Country_Region, date) %>%
  summarize(cases = sum(cases), deaths = sum(deaths), Population = sum(Population)) %>%
  mutate(deaths_per_mill = deaths * 1000000 / Population) %>%
  select(Country_Region, date, cases, deaths) %>%
  ungroup()

#US_totals

Visualizations

US_totals %>%
  filter(cases > 0 ) %>%
  ggplot(aes(x = date, y = cases)) + 
  geom_line(aes(color = "cases")) +
  geom_point(aes(color = "cases")) +
  geom_line(aes(y = deaths, color = "deaths")) +
  geom_point(aes(y = deaths, color = "deaths")) +
  scale_y_log10() +
  theme(legend.position = "bottom",
        axis.text.x = element_text(angle = 90)) +
  labs(title = "COVID-19 in the US", y = "Total", x = "Date")

The COVID-19 cases and deaths in the United States both continue to rise over time, however, it becomes more difficult to tell after a certain point because of the logarithmic scale of the graph. A more digestible way to examine the COVID-19 case data is to instead look at the number of new cases and new deaths each day, rather than the aggregate cases and deaths.

state = "Virginia"

US_by_state %>%
  filter(Province_State == state ) %>%
  filter(cases > 0 ) %>%
  ggplot(aes(x = date, y = cases)) + 
  geom_line(aes(color = "cases")) +
  geom_point(aes(color = "cases")) +
  geom_line(aes(y = deaths, color = "deaths")) +
  geom_point(aes(y = deaths, color = "deaths")) +
  scale_y_log10() +
  theme(legend.position = "bottom",
        axis.text.x = element_text(angle = 90)) +
  labs(title = "COVID19 in Virginia", y = NULL)

Similar to the national data, the cases and deaths both continue to rise in the state of Virginia. Again, we will leverage the use of new cases and deaths per day to receive a better understanding of whether the spread of COVID-19 is accelerating, decelerating, or holding steady.

max(US_totals$date)
## [1] "2021-06-20"
max(US_totals$deaths)
## [1] 601824
US_by_state <- US_by_state %>%
  mutate(new_cases = cases - lag(cases),
         new_deaths = deaths - lag(deaths))

US_totals <- US_totals %>%
  mutate(new_cases = cases -lag(cases),
         new_deaths = deaths - lag(deaths))

tail(US_totals %>% select(new_cases, new_deaths, everything()))
## # A tibble: 6 x 6
##   new_cases new_deaths Country_Region date          cases deaths
##       <dbl>      <dbl> <chr>          <date>        <dbl>  <dbl>
## 1     11304        308 US             2021-06-15 33486038 600329
## 2     12430        368 US             2021-06-16 33498468 600697
## 3     10399        281 US             2021-06-17 33508867 600978
## 4     20608        593 US             2021-06-18 33529475 601571
## 5      8520        170 US             2021-06-19 33537995 601741
## 6      3892         83 US             2021-06-20 33541887 601824
US_totals %>%
  ggplot(aes(x = date, y = new_cases)) +
  geom_line(aes(color = "new_cases")) +
  geom_point(aes(color = "new_cases")) +
  geom_line(aes(y = new_deaths, color = "new_deaths")) +
  geom_point(aes(y = new_deaths, color = "new_deaths")) +
  scale_y_log10() +
  theme(legend.position = "bottom",
        axis.text.x = element_text(angle = 90)) +
  labs(title = "COVID19 in the US", y = "Totals", x = "Date")

We see that the number of new cases and new deaths per day has been decreasing since the beginning of the 2021 calendar year. Despite this, the new COVID-19 deaths in the US is still close to 1000 per day, and the new COVID-19 cases is around 10,000 a day. The spread of the virus in the US appears to be lessening but it is still spreading rapidly.

state = "Virginia"

US_by_state %>%
  filter(Province_State == state ) %>%
  filter(cases > 0 ) %>%
  ggplot(aes(x = date, y = new_cases)) + 
  geom_line(aes(color = "new_cases")) +
  geom_point(aes(color = "new_cases")) +
  geom_line(aes(y = new_deaths, color = "new_deaths")) +
  geom_point(aes(y = new_deaths, color = "new_deaths")) +
  scale_y_log10() +
  theme(legend.position = "bottom",
        axis.text.x = element_text(angle = 90)) +
  labs(title = "COVID19 in Virginia", y = NULL)

Specifically in Virginia, the volume of new daily cases at the beginning of the year was cresting at nearly 10000, and now it is all the way down to only 100 new cases per day. Likewise, the new daily deaths has dropped to less than 10, so it appears that Virginia is making good headway towards curbing COVID-19.

US_state_totals <- US_by_state %>%
  group_by(Province_State) %>%
  summarize(deaths = max(deaths), cases = max(cases),
            population = max(Population),
            cases_per_thou = 1000 * cases / population,
            deaths_per_thou = 1000 * deaths / population) %>%
  filter(cases > 0, population > 0)

US_state_totals %>%
  slice_min(deaths_per_thou, n = 10)
## # A tibble: 10 x 6
##    Province_State        deaths  cases population cases_per_thou deaths_per_thou
##    <chr>                  <dbl>  <dbl>      <dbl>          <dbl>           <dbl>
##  1 Northern Mariana Isl~      2    183      55144           3.32          0.0363
##  2 Virgin Islands            30   3752     107268          35.0           0.280 
##  3 Hawaii                   513  37353    1415872          26.4           0.362 
##  4 Vermont                  256  24360     623989          39.0           0.410 
##  5 Alaska                   373  70875     740995          95.6           0.503 
##  6 Maine                    854  68826    1344212          51.2           0.635 
##  7 Oregon                  2754 206777    4217737          49.0           0.653 
##  8 Puerto Rico             2540 139753    3754939          37.2           0.676 
##  9 Utah                    2330 411610    3205958         128.            0.727 
## 10 Washington              5856 447203    7614893          58.7           0.769
US_state_totals %>%
  select(deaths_per_thou, cases_per_thou, everything())
## # A tibble: 55 x 6
##    deaths_per_thou cases_per_thou Province_State       deaths   cases population
##              <dbl>          <dbl> <chr>                 <dbl>   <dbl>      <dbl>
##  1           2.31           112.  Alabama               11306  548657    4903185
##  2           0.503           95.6 Alaska                  373   70875     740995
##  3           2.45           122.  Arizona               17843  889727    7278717
##  4           1.95           115.  Arkansas               5874  345605    3017804
##  5           1.60            96.4 California            63335 3808258   39512223
##  6           1.17            96.2 Colorado               6732  553868    5758736
##  7           2.32            97.8 Connecticut            8270  348665    3565287
##  8           1.73           112.  Delaware               1683  109548     973764
##  9           1.62            69.8 District of Columbia   1141   49243     705749
## 10           1.75           110.  Florida               37555 2354416   21477737
## # ... with 45 more rows

Linear Models

mod <- lm(deaths_per_thou ~ cases_per_thou, data = US_state_totals)
summary(mod)
## 
## Call:
## lm(formula = deaths_per_thou ~ cases_per_thou, data = US_state_totals)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.41872 -0.21743 -0.02948  0.20909  1.04943 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    -0.013040   0.212328  -0.061    0.951    
## cases_per_thou  0.016812   0.002123   7.918 1.51e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4657 on 53 degrees of freedom
## Multiple R-squared:  0.5419, Adjusted R-squared:  0.5333 
## F-statistic:  62.7 on 1 and 53 DF,  p-value: 1.511e-10
mod2 <- lm(deaths_per_thou ~ cases_per_thou + population, data = US_state_totals)
summary(mod2)
## 
## Call:
## lm(formula = deaths_per_thou ~ cases_per_thou + population, data = US_state_totals)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.35691 -0.25883 -0.00448  0.22692  1.01691 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    -5.254e-02  2.094e-01  -0.251    0.803    
## cases_per_thou  1.626e-02  2.106e-03   7.719 3.55e-10 ***
## population      1.530e-08  8.686e-09   1.762    0.084 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4567 on 52 degrees of freedom
## Multiple R-squared:  0.5677, Adjusted R-squared:  0.5511 
## F-statistic: 34.15 on 2 and 52 DF,  p-value: 3.386e-10
anova(mod, mod2)
## Analysis of Variance Table
## 
## Model 1: deaths_per_thou ~ cases_per_thou
## Model 2: deaths_per_thou ~ cases_per_thou + population
##   Res.Df    RSS Df Sum of Sq      F  Pr(>F)  
## 1     53 11.493                              
## 2     52 10.845  1    0.6475 3.1045 0.08395 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
US_state_totals %>%
  slice_min(cases_per_thou)
## # A tibble: 1 x 6
##   Province_State          deaths cases population cases_per_thou deaths_per_thou
##   <chr>                    <dbl> <dbl>      <dbl>          <dbl>           <dbl>
## 1 Northern Mariana Islan~      2   183      55144           3.32          0.0363
US_state_totals %>%
  slice_max(cases_per_thou)
## # A tibble: 1 x 6
##   Province_State deaths  cases population cases_per_thou deaths_per_thou
##   <chr>           <dbl>  <dbl>      <dbl>          <dbl>           <dbl>
## 1 North Dakota     1554 110567     762062           145.            2.04
US_state_totals %>%
  mutate(pred = predict(mod))
## # A tibble: 55 x 7
##    Province_State  deaths  cases population cases_per_thou deaths_per_thou  pred
##    <chr>            <dbl>  <dbl>      <dbl>          <dbl>           <dbl> <dbl>
##  1 Alabama          11306 5.49e5    4903185          112.            2.31   1.87
##  2 Alaska             373 7.09e4     740995           95.6           0.503  1.60
##  3 Arizona          17843 8.90e5    7278717          122.            2.45   2.04
##  4 Arkansas          5874 3.46e5    3017804          115.            1.95   1.91
##  5 California       63335 3.81e6   39512223           96.4           1.60   1.61
##  6 Colorado          6732 5.54e5    5758736           96.2           1.17   1.60
##  7 Connecticut       8270 3.49e5    3565287           97.8           2.32   1.63
##  8 Delaware          1683 1.10e5     973764          112.            1.73   1.88
##  9 District of Co~   1141 4.92e4     705749           69.8           1.62   1.16
## 10 Florida          37555 2.35e6   21477737          110.            1.75   1.83
## # ... with 45 more rows
US_total_w_pred <- US_state_totals %>%
  mutate(pred = predict(mod), pred2 = predict(mod2), std_ratio = ((deaths_per_thou / cases_per_thou) - (mean(deaths_per_thou) / mean(cases_per_thou))) / sd(deaths_per_thou / cases_per_thou))
US_total_w_pred
## # A tibble: 55 x 9
##    Province_State  deaths  cases population cases_per_thou deaths_per_thou  pred
##    <chr>            <dbl>  <dbl>      <dbl>          <dbl>           <dbl> <dbl>
##  1 Alabama          11306 5.49e5    4903185          112.            2.31   1.87
##  2 Alaska             373 7.09e4     740995           95.6           0.503  1.60
##  3 Arizona          17843 8.90e5    7278717          122.            2.45   2.04
##  4 Arkansas          5874 3.46e5    3017804          115.            1.95   1.91
##  5 California       63335 3.81e6   39512223           96.4           1.60   1.61
##  6 Colorado          6732 5.54e5    5758736           96.2           1.17   1.60
##  7 Connecticut       8270 3.49e5    3565287           97.8           2.32   1.63
##  8 Delaware          1683 1.10e5     973764          112.            1.73   1.88
##  9 District of Co~   1141 4.92e4     705749           69.8           1.62   1.16
## 10 Florida          37555 2.35e6   21477737          110.            1.75   1.83
## # ... with 45 more rows, and 2 more variables: pred2 <dbl>, std_ratio <dbl>
US_total_w_pred %>%
  ggplot() +
  geom_point(aes(x = cases_per_thou, y = deaths_per_thou), color = "blue") +
  geom_point(aes(x = cases_per_thou, y = pred), color = "red") +
  geom_point(aes(x = cases_per_thou, y = pred2), color = "green")

We can create a linear regression model with deaths_per_thou as the response and cases_per_thou as the sole predictor. The F-statistic for the model is statistically significant at the alpha level of 0.05, so we can conclude that the model is a better fit to the data than the null model would be. The p-value for the predictor is also significant at the alpha level of 0.05, indicating that it should be included in the model. Using the linear model, we can make predictions for the fitted values of the response for different values of the predictor. By juxtaposing the fitted values (in red) and the actual values (in blue), it is evident that while there is certainly a linear trend in the actual data, a large portion of the variability in the response is unexplained by our simple linear regression model. This can also be seen analytically by examining the adjusted \(R^2\) value, which is ~0.53.

Creating a second linear model and adding in a second predictor, population, could potentially help model the response more accurately. While this model does have a lower residual sum of squares (RSS) than the one predictor model, it is known that adding predictors will always decrease the RSS. To check if this new model is better, we perform an F-test on the two models using the anova function. Since the F-statistic of 3.1013 yields a p-value of 0.084, we fail to reject the null hypothesis that the two models are statistically different at the alpha = 0.05 level. When two models offer similar explanatory power, we opt to select the more parsimonious model to eschew needless complexity. This means we should select the single predictor model as the better model of the two.

Plotting the second model’s fitted values (in green) against the actual response values (still in blue) graphically looks like a much better model fit. While the second model is statistically better at even the alpha = 0.1 level, it would be unethical to select or change the critical level after running analysis; the pre-selected alpha value must be maintained to preserve scientific integrity.

Interactive Graph

fig <- plot_ly(US_total_w_pred, x = ~cases_per_thou, y = ~deaths_per_thou, text = ~Province_State, type = 'scatter', mode = 'markers', color = ~population,
        marker = list(opacity = 0.7))
fig <- fig %>% layout(title = 'COVID-19 Cases and Deaths by US State',
         xaxis = list(showgrid = FALSE),
         yaxis = list(showgrid = FALSE)) #%>%
        #add_trace(
          #type = 'scatter',
          #hovertemplate = paste('<i>Cases Per Thou</i>: %{x:.2f}',
                          #'<br><i>Deaths Per Thou</i>: %{y:.2f}<br>',
                          #'<b>%{:Country_Region}</b><extra></extra>')
        #)

fig

To get a better picture of what the data for each individual state looks like, we can use plotly to get hovertext to show up over each data point. This enables us to see the name of the state, the cases_per_thou, the deaths_per_thou, and the population can be estimated by the colorscale shown in the legend. Interestingly, most of the states with both the lowest and the highest cases_per_thou are low in total population. The data suggests that there are some other important factors at play that are not fully encapsulated by the dataset.

Chloropleths

US_chloropleth <- US_state_totals %>% 
  mutate(state = Province_State)

plot_usmap(data = US_chloropleth, 
           values = "deaths_per_thou") + 
    scale_fill_gradient(name = "Deaths per thousand",
                      low = "green", high = "red") +
  theme(legend.position = "right") +
  labs(title = "COVID-19 Deaths Per Thousand People in the US")

plot_usmap(data = US_chloropleth, 
           values = "cases_per_thou") + 
    scale_fill_gradient(name = "Cases per thousand",
                      low = "green", high = "red") +
  theme(legend.position = "right") +
  labs(title = "COVID-19 Cases Per Thousand People in the US")

One question of interest is what is the effect of geographic location on the severity of COVID-19 cases and deaths? Are there clusters of states that have similar COVID-19 rates? To check on this, we can construct a chloropleth for both cases_per_thou and deaths_per_thou to see if there are any pictorially discernible patterns. The cases_per_thou chloropleth shows two clusters in the Northwest and the Northeast with a low number cases and then one cluster in the upper Midwest with a high number of cases. The deaths_per_thou chloropleth shows low death clusters in the same places again, but the highest death cluster is in the lower Northeast this time and the second highest death cluster is the entire line of Southern states. Both chloropleths strongly suggest that the geographical aspect of the COVID-19 spread is relevant to some degree.

Conclusion and Sources of Bias

COVID-19 is a very serious pandemic and the more we can understand about how and why it spreads, the better we can fight against its proliferation. The collected data gives us some insight on how the population of an area and the number of cases per thousand citizens can effect the number of deaths per thousand citizens for that area. I chose to focus my statistical modeling on the US data and used linear regression models to procure predictions for the deaths per thousand for each state. However, the linear models available to explain the response given the predictors available in the data could be drastically improved. For the purposes of doing a deeper analysis, a few predictors that could be a boon to be able to add to the models at the state level would be population density, political view, lockdown restrictions, testing rate, daily vaccinations, and temperature.

One large potential source of bias in the data collection is how the states elect to report confirmed cases and deaths. Depending on their individual infrastructure for data collection, reports could be slow, incomplete, or misrecorded. Another important factor that can bias the data of the COVID-19 cases would be the rate at which individuals are getting tested, both asymptomatically and after they feel ill. The true population parameter of the infected people could be dramatically different if sick individuals are getting officially recorded in the system, leading to a significant bias in the direction of underestimation. A source of potential personal bias is that I chose to focus on the US data for modeling and largely ignored the global data, since I am from the United States. One final bias source is the alpha level of 0.05 that I chose before starting to analyze; I picked the default value because I do not have knowledge about virology so it is possible that the typical value for this field of study should be different. There could have been different conclusion to draw when examining a more eclectic source of COVID-19 data that I missed out on from analyzing from a domestic point of view. The data shows that COVID-19 is still very potent but on the decline, so hopefully by continuing to employ safety measures and distributing vaccines the brunt of the pandemic will soon be behind us.

Reproducibility Info

sessionInfo()
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 8.1 x64 (build 9600)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United States.1252 
## [2] LC_CTYPE=English_United States.1252   
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.1252    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] usmap_0.5.2      plotly_4.9.3     leaps_3.1        lubridate_1.7.10
##  [5] forcats_0.5.1    stringr_1.4.0    dplyr_1.0.5      purrr_0.3.4     
##  [9] readr_1.4.0      tidyr_1.1.3      tibble_3.1.1     ggplot2_3.3.3   
## [13] tidyverse_1.3.1 
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.6        assertthat_0.2.1  digest_0.6.27     utf8_1.2.1       
##  [5] R6_2.5.0          cellranger_1.1.0  backports_1.2.1   reprex_2.0.0     
##  [9] evaluate_0.14     httr_1.4.2        highr_0.9         pillar_1.6.0     
## [13] rlang_0.4.10      lazyeval_0.2.2    curl_4.3          readxl_1.3.1     
## [17] rstudioapi_0.13   data.table_1.14.0 jquerylib_0.1.3   rmarkdown_2.7    
## [21] labeling_0.4.2    htmlwidgets_1.5.3 munsell_0.5.0     broom_0.7.6      
## [25] compiler_3.6.1    modelr_0.1.8      xfun_0.22         pkgconfig_2.0.3  
## [29] htmltools_0.5.1.1 tidyselect_1.1.0  fansi_0.4.2       viridisLite_0.4.0
## [33] crayon_1.4.1      dbplyr_2.1.1      withr_2.4.2       grid_3.6.1       
## [37] jsonlite_1.7.2    gtable_0.3.0      lifecycle_1.0.0   DBI_1.1.1        
## [41] magrittr_2.0.1    scales_1.1.1      cli_2.4.0         stringi_1.5.3    
## [45] farver_2.1.0      fs_1.5.0          xml2_1.3.2        bslib_0.2.4      
## [49] ellipsis_0.3.1    generics_0.1.0    vctrs_0.3.7       tools_3.6.1      
## [53] glue_1.4.2        crosstalk_1.1.1   hms_1.0.0         yaml_2.2.1       
## [57] colorspace_2.0-0  rvest_1.0.0       knitr_1.33        haven_2.4.0      
## [61] sass_0.3.1